Items from Key Terms

Refer to review slides for definitions.

Mode

# Mode of the cylinder counts in mtcars. getModes() is a helper defined
# outside this section -- presumably it returns the most frequent value(s).
getModes(mtcars[["cyl"]])
## [1] 8

Frequency Distribution and Histograms

Categorical Data

# Count the observations per species and render the counts as a gt table.
# table() on an unnamed vector yields a column called `Var1`, which is
# relabelled to "Species" for display.
table(iris[["Species"]]) %>%
  data.frame() %>%
  gt() %>%
  cols_label(Var1 = "Species")
Species Freq
setosa 50
versicolor 50
virginica 50
# Bar chart of the number of observations per species. Species is a discrete
# variable, so geom_bar() (which counts rows per category) is the matching
# geom; geom_histogram() is intended for binning continuous data.
ggplot(iris, aes(Species)) +
  geom_bar(col = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Plant Species") +
  theme_bw()

Quantitative Data

# Bin edges for miles per gallon: 10, 12, ..., 34 (bins of width 2).
bins <- seq(from = 10, to = 34, by = 2)

# Assign each car's mpg to its bin; cut() labels the intervals as
# right-closed ranges such as (10,12].
mpg <- cut(mtcars[["mpg"]], breaks = bins)

# Frequency distribution of the binned mpg values as a data frame
# (columns: mpg, Freq), then displayed as a gt table.
freqDist <- data.frame(table(mpg))
gt(freqDist)
mpg Freq
(10,12] 2
(12,14] 1
(14,16] 7
(16,18] 3
(18,20] 5
(20,22] 5
(22,24] 2
(24,26] 2
(26,28] 1
(28,30] 0
(30,32] 2
(32,34] 2
# Binning using ggplot
# boundary = 10 anchors the width-2 bins on the edges 10, 12, 14, ..., i.e.
# the same intervals as the manual frequency table. Without it,
# geom_histogram() centres bins on multiples of binwidth, which puts the
# edges at 9, 11, 13, ... and produces different counts.
ggplot(mtcars, aes(mpg)) +
  geom_histogram(
    binwidth = 2,
    boundary = 10,
    col = "black",
    fill = "darkblue",
    alpha = 0.75
  ) +
  labs(
    title = "Distribution of Miles Per Gallon",
    caption = "1974 Motor Trend US Magazine"
  ) +
  theme_bw()

# Using manual binning method
# Plot the pre-computed counts in freqDist directly: geom_col() draws the
# Freq values as given, one bar per mpg interval.
ggplot(data = freqDist, mapping = aes(x = mpg, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(
    title = "Distribution of Miles Per Gallon",
    caption = "Data is from 1974 Motor Trend US Magazine",
    x = "Miles Per Gallon",
    y = "Frequency"
  ) +
  theme_bw()

Unimodal Example

# Symmetric: counts rise to a single peak at 3 and fall off evenly
rating <- rep(c(1, 2, 3, 4, 5), times = c(10, 20, 30, 20, 10))

# Frequency table of the ratings as a data frame (columns: rating, Freq),
# then displayed as a gt table.
freqDist <- data.frame(table(rating))
gt(freqDist)
rating Freq
1 10
2 20
3 30
4 20
5 10
# Column chart of the rating frequencies, one bar per rating value.
ggplot(data = freqDist, mapping = aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()

# Asymmetric: single peak at 4, with the longer tail toward the low ratings
rating <- rep(c(1, 2, 3, 4, 5), times = c(10, 20, 30, 60, 35))

# Frequency table of the ratings as a data frame (columns: rating, Freq),
# then displayed as a gt table.
freqDist <- data.frame(table(rating))
gt(freqDist)
rating Freq
1 10
2 20
3 30
4 60
5 35
# Column chart of the rating frequencies, one bar per rating value.
ggplot(data = freqDist, mapping = aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()

Bimodal Example

# Symmetric: two equal peaks at 2 and 4, mirrored around a dip at 3
rating <- rep(c(1, 2, 3, 4, 5), times = c(20, 40, 10, 40, 20))

# Frequency table of the ratings as a data frame (columns: rating, Freq),
# then displayed as a gt table.
freqDist <- data.frame(table(rating))
gt(freqDist)
rating Freq
1 20
2 40
3 10
4 40
5 20
# Column chart of the rating frequencies, one bar per rating value.
ggplot(data = freqDist, mapping = aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()

# Asymmetric: two equal peaks at 1 and 3, with a tail toward the high ratings
rating <- rep(c(1, 2, 3, 4, 5), times = c(30, 10, 30, 15, 5))

# Frequency table of the ratings as a data frame (columns: rating, Freq),
# then displayed as a gt table.
freqDist <- data.frame(table(rating))
gt(freqDist)
rating Freq
1 30
2 10
3 30
4 15
5 5
# Column chart of the rating frequencies, one bar per rating value.
ggplot(data = freqDist, mapping = aes(x = rating, y = Freq)) +
  geom_col(colour = "black", fill = "darkblue", alpha = 0.75) +
  labs(title = "Distribution of Ratings for Local Restaurant") +
  theme_bw()

Summary Statistics

# Base
# summary() on the three selected columns prints min, quartiles, median,
# mean and max for each.
mtcars %>%
  select(mpg, cyl, hp) %>%
  summary()
##       mpg             cyl              hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :335.0
# Upgraded
# tbl_summary() table for the same three columns. Continuous variables are
# typed "continuous2" so each can show several statistic rows (mean/SD,
# median/quartiles, min/max); categorical variables show n / N (percent).
mtcars %>%
  select(mpg, cyl, hp) %>%
  tbl_summary(
    type = all_continuous() ~ "continuous2",
    statistic = list(
      all_continuous() ~ c(
        "{mean} ({sd})",
        "{median} ({p25}, {p75})",
        "{min}, {max}"
      ),
      all_categorical() ~ "{n} / {N} ({p}%)"
    )
  )
Characteristic N = 32
mpg
Mean (SD) 20.1 (6.0)
Median (IQR) 19.2 (15.4, 22.8)
Range 10.4, 33.9
cyl
4 11 / 32 (34%)
6 7 / 32 (22%)
8 14 / 32 (44%)
hp
Mean (SD) 147 (69)
Median (IQR) 123 (96, 180)
Range 52, 335

Box Plot

Sometimes it is easier to view this data in a box plot. Potential outliers are the points that fall above the upper fence or below the lower fence, where IQR = Q3 − Q1 is the interquartile range. The upper and lower fences are calculated as follows:

\[\text{Upper Fence} = Q_3 + (1.5 \times \text{IQR})\] \[\text{Lower Fence} = Q_1 - (1.5 \times \text{IQR})\]

# Interactive box plot of Star Wars character heights; the character names
# are passed as the text attached to each point.
plot_ly(
  y = starwars$height,
  type = "box",
  name = "Height [cm]",
  text = starwars$name
) %>%
  layout(title = "Distribution of Star Wars Character Heights")

Scatter Plots

# Interactive scatter plot of mass against height, with each point carrying
# the character's name as its text.
plot_ly(
  data = starwars,
  x = ~height,
  y = ~mass,
  type = "scatter",
  text = ~name
)
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Time Series

# Open the help page for the sp500 dataset (useful interactively).
?sp500

# Line chart of the `high` column over time with a dashed linear trend line.
# The title and y-axis label name the plotted column (`high`), so the labels
# match what is actually drawn.
ggplot(sp500, aes(date, high)) +
  geom_line(color = "gray40", alpha = 0.75) +
  # se = FALSE draws the fitted line without its confidence band.
  geom_smooth(
    method = "lm",
    se = FALSE,
    color = "darkblue",
    linetype = "dashed"
  ) +
  theme_bw() +
  labs(
    title = "High Price of the S&P 500 Over Time",
    subtitle = "from 1950-2015",
    x = "Date",
    y = "High Price"
  )
## `geom_smooth()` using formula 'y ~ x'